1 OECD 국가

Wikipedia OECD 웹사이트에서 먼저 OECD 국가명을 추출한다.

library(tidyverse)
library(rvest)

oecd_countries <- read_html("https://en.wikipedia.org/wiki/OECD") %>% 
  html_nodes(xpath = '//*[@id="mw-content-text"]/div[1]/table[5]') %>% 
  html_table() %>% 
  .[[1]]

oecd_df <- oecd_countries %>% 
  as_tibble() %>% 
  janitor::clean_names() %>% 
  # mutate(id = row_number()) %>%
  select(content=country, start=membership_1, group = geographic_location) %>% 
  mutate(start = str_extract(start, "[0-9]{1,2}\\s[a-zA-Z]+\\s[0-9]{4}")) %>% 
  mutate(start = lubridate::dmy(start)) 

oecd_df
# A tibble: 37 x 3
   content        start      group        
   <chr>          <date>     <chr>        
 1 Australia      1971-06-07 Oceania      
 2 Austria        1961-09-29 Europe       
 3 Belgium        1961-09-13 Europe       
 4 Canada         1961-04-10 North America
 5 Chile          2010-05-07 South America
 6 Colombia       2020-04-28 South America
 7 Czech Republic 1995-12-21 Europe       
 8 Denmark        1961-05-30 Europe       
 9 Estonia        2010-12-09 Europe       
10 Finland        1969-01-28 Europe       
# ... with 27 more rows

OECD 가입순서대로 대륙별로 타임라인을 잡아보자. timeviz 팩키지를 통해 시간순으로 OECD 가입국가를 살펴봄으로서 대략적인 추세를 확인할 수 있다. 특히 대륙별로 가입순서를 나눠 보는 것도 흥미로운 사항을 파악할 수 있을 것이다.

library(timevis)

location <- oecd_df %>% 
  count(group) %>% 
  select(id = group) %>% 
  mutate(content = id)
  
timevis(oecd_df, groups = location, options = list(stack = FALSE)) %>% 
  setOptions(list(editable = TRUE)) %>%
  setSelection("South Korea") %>%
  fitWindow(list(animation = FALSE))

2 OECD 국가 비교

OECD 국가별 비교를 위해서 다양한 지표를 활용하여 평가가 이루지고 있다. 부폐관련 지표, 민주화 관련 지표, 국가 안정성 지표 등 다양한 지표를 통해 국가별 비교 작업을 수행할 수 있다. OECD 국가별 결측값이 일부 있어 이를 구글링을 통해 위키백과사전에 실린 오류를 수정하여 최대한 국가를 살리도록 한다.

oecd_fact <- read_html("https://en.wikipedia.org/wiki/OECD") %>% 
  html_nodes(xpath = '//*[@id="mw-content-text"]/div[1]/table[6]') %>% 
  html_table() %>% 
  .[[1]] %>% 
  as_tibble() %>% 
  janitor::clean_names()

oecd_fact_df <- oecd_fact %>% 
  set_names(c("country", "area", "population", "gdp", 
              "gdp_per_capita", "income_inequality", 
              "hdi", "fsi", "rli", "cpi", "ief", 
              "gpi", "wpfi", "di")) %>% 
  mutate(income_inequality = ifelse(income_inequality == "N/A", NA, income_inequality),
         fsi = ifelse(fsi == "N/A", NA, fsi),
         rli = ifelse(rli == "N/A", NA, rli),
         gpi = ifelse(gpi == "N/A", NA, gpi)) %>% 
  select(-rli) %>%  ## 결측값이 8개국
  mutate(income_inequality = ifelse(country == "New Zealand", 33, income_inequality),
         income_inequality = ifelse(country == "Poland", 31.8, income_inequality),
         fsi = ifelse(country == "Israel", 75.1, fsi)) %>% 
  filter(!str_detect(country, "Luxembourg|OECD|Country"))  %>% 
  mutate(across(.cols=area:di, parse_number))


## 변수 결측값
sapply(oecd_fact_df, function(y) sum(length(which(is.na(y))))) %>% 
  as.data.frame() %>% 
  rownames_to_column(var="country") %>% 
  as_tibble() %>% 
  set_names(c("country", "missings")) %>% 
  arrange(desc(missings))
# A tibble: 13 x 2
   country           missings
   <chr>                <int>
 1 country                  0
 2 area                     0
 3 population               0
 4 gdp                      0
 5 gdp_per_capita           0
 6 income_inequality        0
 7 hdi                      0
 8 fsi                      0
 9 cpi                      0
10 ief                      0
11 gpi                      0
12 wpfi                     0
13 di                       0
## 관측점 국가별 결측값
rowSums(is.na(oecd_fact_df)) %>% 
  as_tibble() %>% 
  bind_cols(oecd_fact_df %>% select(country)) %>% 
  rename(missings = value) %>% 
  arrange(desc(missings))
# A tibble: 36 x 2
   missings country       
      <dbl> <chr>         
 1        0 Australia     
 2        0 Austria       
 3        0 Belgium       
 4        0 Canada        
 5        0 Chile         
 6        0 Colombia      
 7        0 Czech Republic
 8        0 Denmark       
 9        0 Estonia       
10        0 Finland       
# ... with 26 more rows

정제된 데이터를 경제적인 측정지수 외에 다양한 지수를 바탕으로 일별해보자.

  • HDI: Human Development Index
  • FSI: Fragile States Index
  • CPI: Corruption Perceptions Index
  • IEF: Index of Economic Freedom
  • GPI: Global Peace Index
  • WPFI: Reporters Without Borders
  • DI: Democracy Index
oecd_fact_df %>% 
  arrange(desc(gdp)) %>% 
  DT::datatable() %>% 
  DT::formatRound(c("area", "population", "gdp", "gdp_per_capita"), digits = 0, interval = 3)

2.1 국가코드

위키백과사전에 담긴 OECD 국가정보와 World Economic Forum, IMD 국가 경쟁력 지표를 한데 묶는데 관계형 데이터베이스 모형에서 키값이 중요하기 때문에 countrycode 팩키지를 중간 다리로 삼아 최대한 정확히 국가명이 매칭되도록 준비한다.

library(countrycode)

oecd_fact_df <- oecd_fact_df %>% 
  mutate(flags = countrycode(country, origin = 'country.name', 'unicode.symbol'),
         iso3c = countrycode(country, origin = 'country.name', 'iso3c')) %>% 
  select(country, flags, iso3c, everything())

oecd_fact_df
# A tibble: 36 x 15
   country flags iso3c   area population     gdp gdp_per_capita income_inequali~
   <chr>   <chr> <chr>  <dbl>      <dbl>   <dbl>          <dbl>            <dbl>
 1 Austra~ "\U0~ AUS   7.74e6   24598933 1.19e12          48460             34.7
 2 Austria "\U0~ AUT   8.39e4    8809212 4.62e11          52398             30.5
 3 Belgium "\U0~ BEL   3.05e4   11372068 5.44e11          47840             27.7
 4 Canada  "\U0~ CAN   9.98e6   36708083 1.71e12          46705             34  
 5 Chile   "\U0~ CHL   7.56e5   18054726 4.45e11          24635             47.7
 6 Colomb~ "\U0~ COL   1.14e6   48901066 7.09e11          14507             49.7
 7 Czech ~ "\U0~ CZE   7.89e4   10591323 3.85e11          36327             25.9
 8 Denmark "\U0~ DNK   4.29e4    5769603 2.96e11          51364             28.2
 9 Estonia "\U0~ EST   4.52e4    1315480 4.18e10          31742             32.7
10 Finland "\U0~ FIN   3.38e5    5511303 2.47e11          44866             27.1
# ... with 26 more rows, and 7 more variables: hdi <dbl>, fsi <dbl>, cpi <dbl>,
#   ief <dbl>, gpi <dbl>, wpfi <dbl>, di <dbl>

3 Digital Transformation

디지털 전환(Digital Transformation) 대전환기에 WEF(World Economic Forum)에서 측정한 국가별 경쟁력 지표와 IMD 디지털 경쟁력(IMD Digital Competitiveness) 국가별 경쟁력 지표를 핵심 지표로 삼도록 한다.

3.1 weforum Global Competitiveness1

World Economic Forum, “Global Competitiveness Index 4.0”에서 보고서를 받아 국가별 핵심 정보만 추출한다.

한국 참여자

  • Korea, Republic of: Korea Development Institute
    • Joonghae Suh, Executive Director, Economic Information and Education Center
    • Youngho Jung, Head, Public Opinion Analysis Unit
    • Jiyeon Park, Senior Research Associate, Public Opinion Analysis Unit

library(pdftools)

wef <- pdftools::pdf_data("data/WEF_TheGlobalCompetitivenessReport2019.pdf")

wef_tbl <- wef[[15]] # The Global Competitiveness Index 4.0 2019 Rankings

## #1 ~ #47 국가
wef_tbl_01 <- wef_tbl %>% 
  filter(x > 82, x < 220, y > 140, y < 720)

wef_tbl_01 <- wef_tbl_01 %>% 
  mutate(x = round(x/5),     #reduce resolution to minimise inconsistent coordinates
         y = round(y/5)) %>% 
  arrange(y, x) %>%                        #sort in reading order
  mutate(group = cumsum(!lag(space, default = 0))) %>%  #identify text with spaces and paste
  group_by(group) %>% 
  summarise(x = first(x),
            y = first(y),
            text = paste(text, collapse = " ")) %>% 
  group_by(y) %>% 
  mutate(colno = row_number()) %>%         #add column numbers for table data 
  ungroup() %>% 
  select(text, colno, y) %>% 
  pivot_wider(names_from = colno, values_from = text) %>% #pivot into table format
  select(-y) %>% 
  mutate(rank = row_number()) %>% 
  set_names(c("country", "score", "rank_change", "score_change", "rank"))

## #48 ~ #94 국가
wef_tbl_02 <- wef_tbl %>% 
  filter(x > 246, x < 380, y > 138, y < 720)

wef_tbl_02 <- wef_tbl_02 %>% 
  mutate(x = round(x/5),     #reduce resolution to minimise inconsistent coordinates
         y = round(y/5)) %>% 
  arrange(y, x) %>%                        #sort in reading order
  mutate(group = cumsum(!lag(space, default = 0))) %>%  #identify text with spaces and paste
  group_by(group) %>% 
  summarise(x = first(x),
            y = first(y),
            text = paste(text, collapse = " ")) %>% 
  group_by(y) %>% 
  mutate(colno = row_number()) %>%         #add column numbers for table data 
  ungroup() %>% 
  select(text, colno, y) %>% 
  pivot_wider(names_from = colno, values_from = text) %>% #pivot into table format
  select(-y) %>% 
  mutate(rank = 47 + row_number()) %>% 
  set_names(c("country", "score", "rank_change", "score_change", "rank"))

wef_top_country <- wef_tbl_01 %>% 
  bind_rows(wef_tbl_02) %>% 
  mutate(score = parse_number(score)) %>% 
  select(wef_rank = rank, country, wef_score = score) %>% 
  mutate(iso3c = countrycode(country, origin = 'country.name', 'iso3c'))

wef_top_country
# A tibble: 93 x 4
   wef_rank country        wef_score iso3c
      <dbl> <chr>              <dbl> <chr>
 1        1 United States       83.7 USA  
 2        2 Hong Kong SAR       83.1 HKG  
 3        3 Netherlands         82.4 NLD  
 4        4 Switzerland         82.3 CHE  
 5        5 Japan               82.3 JPN  
 6        6 Germany             81.8 DEU  
 7        7 Sweden              81.2 SWE  
 8        8 United Kingdom      81.2 GBR  
 9        9 Denmark             81.2 DNK  
10       10 Finland             80.2 FIN  
# ... with 83 more rows

3.2 IMD Digital Competitiveness

마찬가지 방법으로 The 2020 IMD World Digital Competitiveness Ranking 웹사이트에서 The 2020 IMD World Digital Competitiveness Ranking 보고서를 받아 필요한 데이터를 추출하도록 한다.

library(tabulizer)

# locate_areas("data/digital_2020.pdf", pages = 24, copy = TRUE)

## USA ~ Poland 국가
digital_01 <- extract_tables("data/digital_2020.pdf", pages = 24, guess = FALSE, method = "decide", output = "data.frame",
                          area = list(c(155.45691, 44.26341, 590.53436, 297.63800)), encoding = "LATIN-1") %>% 
  .[[1]] %>% 
  set_names(c("country_econ", "z1", "z2", "2019")) %>% 
  select(country_econ, z1, `2019`) %>% 
  mutate(country = str_extract(country_econ, "[a-zA-Z \\.]*"),
         digital_rank = str_extract(country_econ, "[0-9]{1,2}")) %>% 
  select(country, digital_rank)
  
## Spain ~ Venezuela 국가
digital_02 <- extract_tables("data/digital_2020.pdf", pages = 24, guess = FALSE, output = "data.frame",
                          area = list(c(155.4569, 300.6664, 574.3830, 554.0410 ))) %>% 
  .[[1]] %>%
  set_names(c("country_econ", "z1", "z2")) %>% 
  mutate(country = str_extract(country_econ, "[a-zA-Z \\.]*"),
         digital_rank = str_extract(country_econ, "[0-9]{1,2}")) %>% 
  select(country, digital_rank)

digital_df <- bind_rows(digital_01, digital_02) %>% 
  mutate(country = ifelse(str_detect(country, "Kong SAR"),"Hong Kong", country),
         country = ifelse(str_detect(country, "Sdwitzerland"),"Switzerland", country),
         country = ifelse(str_detect(country, "Nsetherlands"),"Netherlands", country),
         country = ifelse(str_detect(country, "K.orea Rep."),"Korea Rep.", country),
         country = ifelse(str_detect(country, "Ugndiotemd Kingdom"),"United Kingdom", country),
         country = ifelse(str_detect(country, "Nnedw Zealand"),"New Zealand", country),
         country = ifelse(str_detect(country, "Lrguxembourg"),"Luxembourg", country),
         country = ifelse(str_detect(country, "Saudi ArabSiaudi Arabia"),"Saudi Arabia", country),
         country = ifelse(str_detect(country, "hTianiawan"),"Taiwan", country),
         country = ifelse(str_detect(country, "South AfricSaouth Africa"),"South Africa", country)) %>% 
  mutate(iso3c = countrycode(country, origin = 'country.name', 'iso3c')) %>% 
  select(iso3c, digital_rank) %>% 
  mutate(digital_rank = parse_number(digital_rank)) %>% 
  as_tibble()

digital_df
# A tibble: 63 x 2
   iso3c digital_rank
   <chr>        <dbl>
 1 USA              1
 2 SGP              2
 3 DNK              3
 4 SWE              4
 5 HKG              5
 6 CHE              6
 7 NLD              7
 8 KOR              8
 9 NOR              9
10 FIN             10
# ... with 53 more rows

4 OECD 통합 데이터

마지막 단계로 위키백과사전 OECD 국가별 중요 지표와 디지털 경쟁력을 담고 있는 World Economic Forum(WEF) 글로벌 경쟁력, IMD 디지털 경쟁력 지표를 결합시켜 후속 작업을 위한 데이터로 준비한다.

library(fuzzyjoin)

oecd_full_df <- oecd_fact_df %>%
  inner_join(wef_top_country, by="iso3c") %>% 
  select(country = country.x, everything(), -country.y) %>% 
  left_join(digital_df, by = "iso3c") %>% 
  select(-wef_score)

oecd_full_df 
# A tibble: 36 x 17
   country flags iso3c   area population     gdp gdp_per_capita income_inequali~
   <chr>   <chr> <chr>  <dbl>      <dbl>   <dbl>          <dbl>            <dbl>
 1 Austra~ "\U0~ AUS   7.74e6   24598933 1.19e12          48460             34.7
 2 Austria "\U0~ AUT   8.39e4    8809212 4.62e11          52398             30.5
 3 Belgium "\U0~ BEL   3.05e4   11372068 5.44e11          47840             27.7
 4 Canada  "\U0~ CAN   9.98e6   36708083 1.71e12          46705             34  
 5 Chile   "\U0~ CHL   7.56e5   18054726 4.45e11          24635             47.7
 6 Colomb~ "\U0~ COL   1.14e6   48901066 7.09e11          14507             49.7
 7 Czech ~ "\U0~ CZE   7.89e4   10591323 3.85e11          36327             25.9
 8 Denmark "\U0~ DNK   4.29e4    5769603 2.96e11          51364             28.2
 9 Estonia "\U0~ EST   4.52e4    1315480 4.18e10          31742             32.7
10 Finland "\U0~ FIN   3.38e5    5511303 2.47e11          44866             27.1
# ... with 26 more rows, and 9 more variables: hdi <dbl>, fsi <dbl>, cpi <dbl>,
#   ief <dbl>, gpi <dbl>, wpfi <dbl>, di <dbl>, wef_rank <dbl>,
#   digital_rank <dbl>

작업한 데이터를 국가별로 정리하여 로컬 파일로 저장시킨다.

oecd_full_df %>% 
  write_rds("data/oecd_full_df.rds")
 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com